R Markdown

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(version = "3.11")
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.3 (2020-10-10)
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("rnaseqGene")
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.3 (2020-10-10)
## Installing package(s) 'rnaseqGene'
## installing the source package 'rnaseqGene'
# I also needed to install the following on my computer 
# (You will realize this if you get an error message that the corresponding library will not load)

BiocManager::install("airway")
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.3 (2020-10-10)
## Installing package(s) 'airway'
## installing the source package 'airway'
BiocManager::install("tximeta")
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.3 (2020-10-10)
## Installing package(s) 'tximeta'
## 
## The downloaded binary packages are in
##  /var/folders/_8/93ymbwtn3t179r6g59lnt6sh0000gn/T//Rtmpu11DuY/downloaded_packages
BiocManager::install("DESeq2")
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.3 (2020-10-10)
## Installing package(s) 'DESeq2'
## 
## The downloaded binary packages are in
##  /var/folders/_8/93ymbwtn3t179r6g59lnt6sh0000gn/T//Rtmpu11DuY/downloaded_packages
BiocManager::install("Gviz")
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.3 (2020-10-10)
## Installing package(s) 'Gviz'
## 
## The downloaded binary packages are in
##  /var/folders/_8/93ymbwtn3t179r6g59lnt6sh0000gn/T//Rtmpu11DuY/downloaded_packages
BiocManager::install("sva")
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.3 (2020-10-10)
## Installing package(s) 'sva'
## 
## The downloaded binary packages are in
##  /var/folders/_8/93ymbwtn3t179r6g59lnt6sh0000gn/T//Rtmpu11DuY/downloaded_packages
BiocManager::install("RUVSeq")
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.3 (2020-10-10)
## Installing package(s) 'RUVSeq'
## 
## The downloaded binary packages are in
##  /var/folders/_8/93ymbwtn3t179r6g59lnt6sh0000gn/T//Rtmpu11DuY/downloaded_packages
BiocManager::install("fission")
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.3 (2020-10-10)
## Installing package(s) 'fission'
## installing the source package 'fission'
library("airway")
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
dir <- system.file("extdata", package="airway", mustWork=TRUE)
list.files(dir)
##  [1] "GSE52778_series_matrix.txt"        "Homo_sapiens.GRCh37.75_subset.gtf"
##  [3] "quants"                            "sample_table.csv"                 
##  [5] "SraRunInfo_SRP033351.csv"          "SRR1039508_subset.bam"            
##  [7] "SRR1039509_subset.bam"             "SRR1039512_subset.bam"            
##  [9] "SRR1039513_subset.bam"             "SRR1039516_subset.bam"            
## [11] "SRR1039517_subset.bam"             "SRR1039520_subset.bam"            
## [13] "SRR1039521_subset.bam"
list.files(file.path(dir, "quants"))
## [1] "SRR1039508" "SRR1039509"
csvfile <- file.path(dir, "sample_table.csv")
coldata <- read.csv(csvfile, row.names=1, stringsAsFactors=FALSE)
coldata
##            SampleName    cell   dex albut        Run avgLength Experiment
## SRR1039508 GSM1275862  N61311 untrt untrt SRR1039508       126  SRX384345
## SRR1039509 GSM1275863  N61311   trt untrt SRR1039509       126  SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512       126  SRX384349
## SRR1039513 GSM1275867 N052611   trt untrt SRR1039513        87  SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516       120  SRX384353
## SRR1039517 GSM1275871 N080611   trt untrt SRR1039517       126  SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520       101  SRX384357
## SRR1039521 GSM1275875 N061011   trt untrt SRR1039521        98  SRX384358
##               Sample    BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677
coldata <- coldata[1:2,]
coldata$names <- coldata$Run
coldata$files <- file.path(dir, "quants", coldata$names, "quant.sf.gz")
file.exists(coldata$files)
## [1] TRUE TRUE
library("tximeta")
se <- tximeta(coldata)
## importing quantifications
## reading in files with read_tsv
## 1 2 
## found matching transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## loading existing TxDb created: 2020-10-19 01:20:54
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## loading existing transcript ranges created: 2020-10-19 01:20:56
## fetching genome info for GENCODE
dim(se)
## [1] 205870      2
head(rownames(se))
## [1] "ENST00000456328.2" "ENST00000450305.2" "ENST00000488147.1"
## [4] "ENST00000619216.1" "ENST00000473358.1" "ENST00000469289.1"
gse <- summarizeToGene(se)
## loading existing TxDb created: 2020-10-19 01:20:54
## obtaining transcript-to-gene mapping from database
## loading existing gene ranges created: 2020-10-19 01:28:23
## summarizing abundance
## summarizing counts
## summarizing length
dim(gse)
## [1] 58294     2
head(rownames(gse))
## [1] "ENSG00000000003.14" "ENSG00000000005.5"  "ENSG00000000419.12"
## [4] "ENSG00000000457.13" "ENSG00000000460.16" "ENSG00000000938.12"
data(gse)
gse
## class: RangedSummarizedExperiment 
## dim: 58294 8 
## metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
## assays(3): counts abundance length
## rownames(58294): ENSG00000000003.14 ENSG00000000005.5 ...
##   ENSG00000285993.1 ENSG00000285994.1
## rowData names(1): gene_id
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(3): names donor condition
assayNames(gse)
## [1] "counts"    "abundance" "length"
head(assay(gse), 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14    708.164    467.962    900.992    424.368   1188.295
## ENSG00000000005.5       0.000      0.000      0.000      0.000      0.000
## ENSG00000000419.12    455.000    510.000    604.000    352.000    583.000
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14   1090.668    805.929    599.337
## ENSG00000000005.5       0.000      0.000      0.000
## ENSG00000000419.12    773.999    409.999    499.000
colSums(assay(gse))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 
##   21100805   19298584   26145537   15688246   25268618   31891456   19683767 
## SRR1039521 
##   21813903
rowRanges(gse)
## GRanges object with 58294 ranges and 1 metadata column:
##                      seqnames              ranges strand |            gene_id
##                         <Rle>           <IRanges>  <Rle> |        <character>
##   ENSG00000000003.14     chrX 100627109-100639991      - | ENSG00000000003.14
##    ENSG00000000005.5     chrX 100584802-100599885      + |  ENSG00000000005.5
##   ENSG00000000419.12    chr20   50934867-50958555      - | ENSG00000000419.12
##   ENSG00000000457.13     chr1 169849631-169894267      - | ENSG00000000457.13
##   ENSG00000000460.16     chr1 169662007-169854080      + | ENSG00000000460.16
##                  ...      ...                 ...    ... .                ...
##    ENSG00000285990.1    chr14   19244904-19269380      - |  ENSG00000285990.1
##    ENSG00000285991.1     chr6 149817937-149896011      - |  ENSG00000285991.1
##    ENSG00000285992.1     chr8   47129262-47132628      + |  ENSG00000285992.1
##    ENSG00000285993.1    chr18   46409197-46410645      - |  ENSG00000285993.1
##    ENSG00000285994.1    chr10   12563151-12567351      + |  ENSG00000285994.1
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome
seqinfo(rowRanges(gse))
## Seqinfo object with 25 sequences (1 circular) from hg38 genome:
##   seqnames seqlengths isCircular genome
##   chr1      248956422      FALSE   hg38
##   chr2      242193529      FALSE   hg38
##   chr3      198295559      FALSE   hg38
##   chr4      190214555      FALSE   hg38
##   chr5      181538259      FALSE   hg38
##   ...             ...        ...    ...
##   chr21      46709983      FALSE   hg38
##   chr22      50818468      FALSE   hg38
##   chrX      156040895      FALSE   hg38
##   chrY       57227415      FALSE   hg38
##   chrM          16569       TRUE   hg38
colData(gse)
## DataFrame with 8 rows and 3 columns
##                 names    donor     condition
##              <factor> <factor>      <factor>
## SRR1039508 SRR1039508  N61311  Untreated    
## SRR1039509 SRR1039509  N61311  Dexamethasone
## SRR1039512 SRR1039512  N052611 Untreated    
## SRR1039513 SRR1039513  N052611 Dexamethasone
## SRR1039516 SRR1039516  N080611 Untreated    
## SRR1039517 SRR1039517  N080611 Dexamethasone
## SRR1039520 SRR1039520  N061011 Untreated    
## SRR1039521 SRR1039521  N061011 Dexamethasone
gse$donor
## [1] N61311  N61311  N052611 N052611 N080611 N080611 N061011 N061011
## Levels: N052611 N061011 N080611 N61311
gse$condition
## [1] Untreated     Dexamethasone Untreated     Dexamethasone Untreated    
## [6] Dexamethasone Untreated     Dexamethasone
## Levels: Untreated Dexamethasone
gse$cell <- gse$donor
gse$dex <- gse$condition
levels(gse$dex)
## [1] "Untreated"     "Dexamethasone"
# when renaming levels, the order must be preserved!
levels(gse$dex) <- c("untrt", "trt")

it is prefered in R that the first level of a factor be the reference level (e.g. control, or untreated samples).

library("magrittr")
gse$dex %<>% relevel("untrt")
gse$dex
## [1] untrt trt   untrt trt   untrt trt   untrt trt  
## Levels: untrt trt

%<>% is the compound assignment pipe-operator from the magrittr package, the above line of code is a concise way of saying:

gse$dex <- relevel(gse$dex, "untrt")

the second argument of round tells how many decimal points to keep

round( colSums(assay(gse)) / 1e6, 1 )
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 
##       21.1       19.3       26.1       15.7       25.3       31.9       19.7 
## SRR1039521 
##       21.8
library("DESeq2")
dds <- DESeqDataSet(gse, design = ~ cell + dex)
## using counts and average transcript lengths from tximeta
countdata <- round(assays(gse)[["counts"]])
head(countdata, 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14        708        468        901        424       1188
## ENSG00000000005.5           0          0          0          0          0
## ENSG00000000419.12        455        510        604        352        583
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14       1091        806        599
## ENSG00000000005.5           0          0          0
## ENSG00000000419.12        774        410        499

If you’ve imported the count data in some other way, for example loading a pre-computed count matrix, it is very important to check manually that the columns of the count matrix correspond to the rows of the sample information table.

coldata <- colData(gse)
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
                                 colData = coldata,
                                 design = ~ cell + dex)
## converting counts to integer mode
nrow(dds)
## [1] 58294
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep,]
nrow(dds)
## [1] 31604
# at least 3 samples with a count of 10 or higher
keep <- rowSums(counts(dds) >= 10) >= 3
lambda <- 10^seq(from = -1, to = 2, length = 1000)
cts <- matrix(rpois(1000*100, lambda), ncol = 100)
library("vsn")
meanSdPlot(cts, ranks = FALSE)

log.cts.one <- log2(cts + 1)
meanSdPlot(log.cts.one, ranks = FALSE)

vsd <- vst(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(vsd), 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14  10.105781   9.852029  10.169726   9.991545  10.424865
## ENSG00000000419.12   9.692244   9.923647   9.801921   9.798653   9.763455
## ENSG00000000457.13   9.449592   9.312186   9.362754   9.459168   9.281415
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14  10.194490  10.315814  10.002177
## ENSG00000000419.12   9.874703   9.683211   9.845507
## ENSG00000000457.13   9.395937   9.477971   9.477027
colData(vsd)
## DataFrame with 8 rows and 5 columns
##                 names    donor     condition     cell      dex
##              <factor> <factor>      <factor> <factor> <factor>
## SRR1039508 SRR1039508  N61311  Untreated      N61311     untrt
## SRR1039509 SRR1039509  N61311  Dexamethasone  N61311     trt  
## SRR1039512 SRR1039512  N052611 Untreated      N052611    untrt
## SRR1039513 SRR1039513  N052611 Dexamethasone  N052611    trt  
## SRR1039516 SRR1039516  N080611 Untreated      N080611    untrt
## SRR1039517 SRR1039517  N080611 Dexamethasone  N080611    trt  
## SRR1039520 SRR1039520  N061011 Untreated      N061011    untrt
## SRR1039521 SRR1039521  N061011 Dexamethasone  N061011    trt
rld <- rlog(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(rld), 3)
##                    SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14   9.482613   9.172197   9.558383   9.346001   9.851349
## ENSG00000000419.12   8.860186   9.150196   9.000042   8.995902   8.951327
## ENSG00000000457.13   8.354790   8.166700   8.236582   8.366693   8.121781
##                    SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14   9.587602   9.727248   9.357876
## ENSG00000000419.12   9.091075   8.848782   9.054384
## ENSG00000000457.13   8.282307   8.392384   8.391023

In the above function calls, we specified blind = FALSE, which means that differences between cell lines and treatment (the variables in the design) will not contribute to the expected variance-mean trend of the experiment.

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("ggplot2")

dds <- estimateSizeFactors(dds)
## using 'avgTxLength' from assays(dds), correcting for library size
df <- bind_rows(
  as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
         mutate(transformation = "log2(x + 1)"),
  as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
  as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
## Warning: `as_data_frame()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
colnames(df)[1:2] <- c("x", "y")  

lvls <- c("log2(x + 1)", "vst", "rlog")
df$transformation <- factor(df$transformation, levels=lvls)

ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
  coord_fixed() + facet_grid( . ~ transformation)  

sampleDists <- dist(t(assay(vsd)))
sampleDists
##            SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
## SRR1039509   39.42362                                                       
## SRR1039512   32.37620   44.93748                                            
## SRR1039513   51.09677   37.18799   41.79886                                 
## SRR1039516   35.59642   47.54671   34.83458   52.05265                      
## SRR1039517   51.26314   41.58572   46.89609   40.67315   39.74268           
## SRR1039520   32.38578   46.96000   30.35980   48.08669   37.07106   50.38349
## SRR1039521   51.49108   37.57383   47.17283   31.45899   52.62276   41.35941
##            SRR1039520
## SRR1039509           
## SRR1039512           
## SRR1039513           
## SRR1039516           
## SRR1039517           
## SRR1039520           
## SRR1039521   43.01502
library("pheatmap")
library("RColorBrewer")
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)

Note that we have changed the row names of the distance matrix to contain treatment type and patient number instead of sample ID, so that we have all this information in view when looking at the heatmap.

library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
         clustering_distance_rows = poisd$dd,
         clustering_distance_cols = poisd$dd,
         col = colors)

Another way to visualize sample-to-sample distances is a principal components analysis (PCA). In this ordination method, the data points (here, the samples) are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences

plotPCA(vsd, intgroup = c("dex", "cell"))

pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
##                   PC1        PC2         group   dex    cell       name
## SRR1039508 -14.311369 -2.6000421  untrt:N61311 untrt  N61311 SRR1039508
## SRR1039509   8.058574 -0.7500532    trt:N61311   trt  N61311 SRR1039509
## SRR1039512  -9.404122 -4.3920761 untrt:N052611 untrt N052611 SRR1039512
## SRR1039513  14.497842 -4.1323833   trt:N052611   trt N052611 SRR1039513
## SRR1039516 -12.365055 11.2109581 untrt:N080611 untrt N080611 SRR1039516
## SRR1039517   9.343946 14.9115160   trt:N080611   trt N080611 SRR1039517
## SRR1039520 -10.852633 -7.7618618 untrt:N061011 untrt N061011 SRR1039520
## SRR1039521  15.032816 -6.4860576   trt:N061011   trt N061011 SRR1039521
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
  geom_point(size =3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed() +
  ggtitle("PCA with VST data")

### Here we specify cell line (plotting symbol) and dexamethasone treatment (color).

library("glmpca")
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$dex <- dds$dex
gpca.dat$cell <- dds$cell
ggplot(gpca.dat, aes(x = dim1, y = dim2, color = dex, shape = cell)) +
  geom_point(size =3) + coord_fixed() + ggtitle("glmpca - Generalized PCA")

mds <- as.data.frame(colData(vsd))  %>%
         cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed() + ggtitle("MDS with VST data")

mdsPois <- as.data.frame(colData(dds)) %>%
   cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed() + ggtitle("MDS with PoissonDistances")

dds <- DESeq(dds)
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
res
## log2 fold change (MLE): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 31604 rows and 6 columns
##                      baseMean log2FoldChange     lfcSE      stat      pvalue
##                     <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000000003.14 739.940717     -0.3611537  0.106869 -3.379419 0.000726392
## ENSG00000000419.12 511.735722      0.2063147  0.128665  1.603509 0.108822318
## ENSG00000000457.13 314.194855      0.0378308  0.158633  0.238479 0.811509461
## ENSG00000000460.16  79.793622     -0.1152590  0.314991 -0.365912 0.714430444
## ENSG00000000938.12   0.307267     -1.3691185  3.503764 -0.390757 0.695977205
## ...                       ...            ...       ...       ...         ...
## ENSG00000285979.1   38.353886      0.3423657  0.359511  0.952310    0.340940
## ENSG00000285987.1    1.562508      0.7064145  1.547295  0.456548    0.647996
## ENSG00000285990.1    0.642315      0.3647333  3.433276  0.106235    0.915396
## ENSG00000285991.1   11.276284     -0.1165515  0.748601 -0.155692    0.876275
## ENSG00000285994.1    3.651041     -0.0960094  1.068660 -0.089841    0.928414
##                          padj
##                     <numeric>
## ENSG00000000003.14 0.00531137
## ENSG00000000419.12 0.29318870
## ENSG00000000457.13 0.92255697
## ENSG00000000460.16 0.87298038
## ENSG00000000938.12         NA
## ...                       ...
## ENSG00000285979.1    0.600750
## ENSG00000285987.1          NA
## ENSG00000285990.1          NA
## ENSG00000285991.1    0.952921
## ENSG00000285994.1          NA
res <- results(dds, contrast=c("dex","trt","untrt"))
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
##                        type                               description
##                 <character>                               <character>
## baseMean       intermediate mean of normalized counts for all samples
## log2FoldChange      results  log2 fold change (MLE): dex trt vs untrt
## lfcSE               results          standard error: dex trt vs untrt
## stat                results          Wald statistic: dex trt vs untrt
## pvalue              results       Wald test p-value: dex trt vs untrt
## padj                results                      BH adjusted p-values
summary(res)
## 
## out of 31604 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2373, 7.5%
## LFC < 0 (down)     : 1949, 6.2%
## outliers [1]       : 0, 0%
## low counts [2]     : 14706, 47%
## (mean count < 9)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

If we lower the false discovery rate threshold, we should also inform the results() function about it, so that the function can use this threshold for the optimal independent filtering that it performs:

res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
## 
## FALSE  TRUE 
## 13357  3541
resLFC1 <- results(dds, lfcThreshold=1)
table(resLFC1$padj < 0.1)
## 
## FALSE  TRUE 
## 16687   211

In general, the results for a comparison of any two levels of a variable can be extracted using the contrast argument to results.

results(dds, contrast = c("cell", "N061011", "N61311"))
## log2 fold change (MLE): cell N061011 vs N61311 
## Wald test p-value: cell N061011 vs N61311 
## DataFrame with 31604 rows and 6 columns
##                      baseMean log2FoldChange     lfcSE       stat    pvalue
##                     <numeric>      <numeric> <numeric>  <numeric> <numeric>
## ENSG00000000003.14 739.940717       0.270945  0.152171   1.780534 0.0749886
## ENSG00000000419.12 511.735722      -0.071831  0.182817  -0.392912 0.6943842
## ENSG00000000457.13 314.194855       0.179881  0.225122   0.799036 0.4242696
## ENSG00000000460.16  79.793622      -0.119482  0.441594  -0.270570 0.7867217
## ENSG00000000938.12   0.307267       0.000000  4.997580   0.000000 1.0000000
## ...                       ...            ...       ...        ...       ...
## ENSG00000285979.1   38.353886      0.0589757  0.512391  0.1150989  0.908367
## ENSG00000285987.1    1.562508      1.0216804  2.201861  0.4640078  0.642642
## ENSG00000285990.1    0.642315     -3.0956404  4.852715 -0.6379193  0.523526
## ENSG00000285991.1   11.276284     -0.8779628  1.046963 -0.8385804  0.401705
## ENSG00000285994.1    3.651041     -0.0192351  1.513236 -0.0127112  0.989858
##                         padj
##                    <numeric>
## ENSG00000000003.14  0.378828
## ENSG00000000419.12  0.936703
## ENSG00000000457.13  0.820733
## ENSG00000000460.16  0.960662
## ENSG00000000938.12        NA
## ...                      ...
## ENSG00000285979.1    0.98371
## ENSG00000285987.1         NA
## ENSG00000285990.1         NA
## ENSG00000285991.1         NA
## ENSG00000285994.1         NA
sum(res$pvalue < 0.05, na.rm=TRUE)
## [1] 5170
sum(!is.na(res$pvalue))
## [1] 31604
sum(res$pvalue < 0.05, na.rm=TRUE)
## [1] 5170
sum(!is.na(res$pvalue))
## [1] 31604
sum(res$padj < 0.1, na.rm=TRUE)
## [1] 4322

We subset the results table to these genes and then sort it by the log2 fold change estimate to get the significant genes with the strongest down-regulation:

resSig <- subset(res, padj < 0.1)
head(resSig[ order(resSig$log2FoldChange), ])
## log2 fold change (MLE): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 6 rows and 6 columns
##                    baseMean log2FoldChange     lfcSE      stat      pvalue
##                   <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000216490.3   42.3007       -5.72483  1.475652  -3.87952 1.04661e-04
## ENSG00000267339.5   30.5206       -5.39781  0.773017  -6.98278 2.89390e-12
## ENSG00000257542.5   10.0399       -5.25991  1.282001  -4.10289 4.08015e-05
## ENSG00000146006.7   61.6448       -4.49504  0.663821  -6.77147 1.27484e-11
## ENSG00000108700.4   14.6324       -4.09069  0.941842  -4.34328 1.40369e-05
## ENSG00000213240.8   12.0962       -3.87313  1.274133  -3.03981 2.36725e-03
##                          padj
##                     <numeric>
## ENSG00000216490.3 9.87853e-04
## ENSG00000267339.5 9.45863e-11
## ENSG00000257542.5 4.30646e-04
## ENSG00000146006.7 3.82632e-10
## ENSG00000108700.4 1.66687e-04
## ENSG00000213240.8 1.44987e-02

strongest up regulation

head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
## log2 fold change (MLE): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000254692.1    62.2302       10.20714   3.37706   3.02250 2.50700e-03
## ENSG00000179593.15   67.0895        9.50515   1.07705   8.82513 1.09334e-18
## ENSG00000268173.3    46.4370        8.40438   3.38506   2.48279 1.30358e-02
## ENSG00000224712.12   35.5607        7.16686   2.16476   3.31070 9.30628e-04
## ENSG00000109906.13  438.1940        6.37750   0.31381  20.32276 8.08934e-92
## ENSG00000257663.1    24.3946        6.34758   2.09531   3.02942 2.45024e-03
##                           padj
##                      <numeric>
## ENSG00000254692.1  1.52167e-02
## ENSG00000179593.15 6.81746e-17
## ENSG00000268173.3  5.94385e-02
## ENSG00000224712.12 6.55240e-03
## ENSG00000109906.13 1.24267e-88
## ENSG00000257663.1  1.49151e-02

A quick way to visualize the counts for a particular gene is to use the plotCounts function that takes as arguments the DESeqDataSet, a gene name, and the group over which to plot the counts (figure below).

topGene <- rownames(res)[which.min(res$padj)]
plotCounts(dds, gene = topGene, intgroup=c("dex"))

library("ggbeeswarm")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c("dex","cell"),
                         returnData = TRUE)
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) +
  scale_y_log10() +  geom_beeswarm(cex = 3)

ggplot(geneCounts, aes(x = dex, y = count, color = cell, group = cell)) +
  scale_y_log10() + geom_point(size = 3) + geom_line()

### Note that the DESeq test actually takes into account the cell line effect, so this figure more closely depicts the difference being tested.

library("apeglm")
resultsNames(dds)
## [1] "Intercept"               "cell_N061011_vs_N052611"
## [3] "cell_N080611_vs_N052611" "cell_N61311_vs_N052611" 
## [5] "dex_trt_vs_untrt"
res <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(res, ylim = c(-5, 5))

An MA-plot (Dudoit et al. 2002) provides a useful overview for the distribution of the estimated coefficients in the model, e.g. the comparisons of interest, across all genes

res.noshr <- results(dds, name="dex_trt_vs_untrt")
plotMA(res.noshr, ylim = c(-5, 5))

plotMA(res, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
  points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
  text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})

### Within the with function, only the baseMean and log2FoldChange values for the selected rows of res are used.

plotMA(res, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
  points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
  text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})

hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20,
     col = "grey50", border = "white")

library("genefilter")
## 
## Attaching package: 'genefilter'
## The following objects are masked from 'package:matrixStats':
## 
##     rowSds, rowVars
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)

Treatment status and cell line information are shown with colored bars at the top of the heatmap. Blocks of genes that covary across patients. Note that a set of genes in the heatmap are separating the N061011 cell line from the others, and there is another set of genes for which the dexamethasone treated samples have higher gene expression.

mat  <- assay(vsd)[ topVarGenes, ]
mat  <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c("cell","dex")])
pheatmap(mat, annotation_col = anno)

qs <- c(0, quantile(resLFC1$baseMean[resLFC1$baseMean > 0], 0:6/6))
bins <- cut(resLFC1$baseMean, qs)
levels(bins) <- paste0("~", round(signif((qs[-1] + qs[-length(qs)])/2, 2)))
fractionSig <- tapply(resLFC1$pvalue, bins, function(p)
                          mean(p < .05, na.rm = TRUE))
barplot(fractionSig, xlab = "mean normalized count",
                     ylab = "fraction of small p values")

In the following code chunk, we create bins using the quantile function, bin the genes by base mean using cut, rename the levels of the bins using the middle point, calculate the ratio of p values less than 0.05 for each bin, and finally plot these ratios (figure below).

qs <- c(0, quantile(resLFC1$baseMean[resLFC1$baseMean > 0], 0:6/6))
bins <- cut(resLFC1$baseMean, qs)
levels(bins) <- paste0("~", round(signif((qs[-1] + qs[-length(qs)])/2, 2)))
fractionSig <- tapply(resLFC1$pvalue, bins, function(p)
                          mean(p < .05, na.rm = TRUE))
barplot(fractionSig, xlab = "mean normalized count",
                     ylab = "fraction of small p values")

library("AnnotationDbi")
library("org.Hs.eg.db")
## 
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"       "UNIGENE"     
## [26] "UNIPROT"
ens.str <- substr(rownames(res), 1, 15)
res$symbol <- mapIds(org.Hs.eg.db,
                     keys=ens.str,
                     column="SYMBOL",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
res$entrez <- mapIds(org.Hs.eg.db,
                     keys=ens.str,
                     column="ENTREZID",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns

We can use the mapIds function to add individual columns to our results table. We provide the row names of our results table as a key, and specify that keytype=ENSEMBL. The column argument tells the mapIds function which information we want, and the multiVals argument tells the function what to do if there are multiple possible values for a single input value.

resOrdered <- res[order(res$pvalue),]
head(resOrdered)
## log2 fold change (MAP): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 6 rows and 7 columns
##                     baseMean log2FoldChange     lfcSE       pvalue         padj
##                    <numeric>      <numeric> <numeric>    <numeric>    <numeric>
## ENSG00000189221.9   2373.805        3.38765  0.136985 1.84494e-137 3.11758e-133
## ENSG00000120129.5   3420.727        2.96335  0.120850 6.35042e-135 5.36547e-131
## ENSG00000101347.9  14125.584        3.74129  0.157927 2.88983e-127 1.62775e-123
## ENSG00000196136.17  2710.217        3.23518  0.143951 3.68488e-114 1.55668e-110
## ENSG00000152583.12   974.737        4.48641  0.201276 2.94551e-113 9.95466e-110
## ENSG00000211445.11 12512.792        3.75875  0.169536 2.36246e-112 6.65348e-109
##                         symbol      entrez
##                    <character> <character>
## ENSG00000189221.9         MAOA        4128
## ENSG00000120129.5        DUSP1        1843
## ENSG00000101347.9       SAMHD1       25939
## ENSG00000196136.17    SERPINA3          12
## ENSG00000152583.12     SPARCL1        8404
## ENSG00000211445.11        GPX3        2878

Exporting results

resOrderedDF <- as.data.frame(resOrdered)[1:100, ]
write.csv(resOrderedDF, file = "results.csv")

file:///Users/jovan/Genomics-Course/report/report.html

resGR <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm", format="GRanges")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resGR
## GRanges object with 31604 ranges and 5 metadata columns:
##                      seqnames              ranges strand |   baseMean
##                         <Rle>           <IRanges>  <Rle> |  <numeric>
##   ENSG00000000003.14     chrX 100627109-100639991      - | 739.940717
##   ENSG00000000419.12    chr20   50934867-50958555      - | 511.735722
##   ENSG00000000457.13     chr1 169849631-169894267      - | 314.194855
##   ENSG00000000460.16     chr1 169662007-169854080      + |  79.793622
##   ENSG00000000938.12     chr1   27612064-27635277      - |   0.307267
##                  ...      ...                 ...    ... .        ...
##    ENSG00000285979.1    chr16   57177349-57181390      + |  38.353886
##    ENSG00000285987.1     chr9   84316514-84657077      + |   1.562508
##    ENSG00000285990.1    chr14   19244904-19269380      - |   0.642315
##    ENSG00000285991.1     chr6 149817937-149896011      - |  11.276284
##    ENSG00000285994.1    chr10   12563151-12567351      + |   3.651041
##                      log2FoldChange     lfcSE      pvalue       padj
##                           <numeric> <numeric>   <numeric>  <numeric>
##   ENSG00000000003.14     -0.3360046  0.105909 0.000726392 0.00531137
##   ENSG00000000419.12      0.1783789  0.122440 0.108822318 0.29318870
##   ENSG00000000457.13      0.0299361  0.141095 0.811509461 0.92255697
##   ENSG00000000460.16     -0.0555061  0.222787 0.714430444 0.87298038
##   ENSG00000000938.12     -0.0115799  0.304740 0.695977205         NA
##                  ...            ...       ...         ...        ...
##    ENSG00000285979.1     0.15284386  0.257070    0.340940   0.600750
##    ENSG00000285987.1     0.02551527  0.300687    0.647996         NA
##    ENSG00000285990.1    -0.00018563  0.304465    0.915396         NA
##    ENSG00000285991.1    -0.01507882  0.283931    0.876275   0.952921
##    ENSG00000285994.1    -0.00684681  0.293399    0.928414         NA
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome
ens.str <- substr(names(resGR), 1, 15)
resGR$symbol <- mapIds(org.Hs.eg.db, ens.str, "SYMBOL", "ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
library("Gviz")
## Loading required package: grid

The following code chunk specifies a window of 1 million base pairs upstream and downstream from the gene with the smallest p value. We create a subset of our full results, for genes within the window. We add the gene symbol as a name if the symbol exists and is not duplicated in our subset.

window <- resGR[topGene] + 1e6
strand(window) <- "*"
resGRsub <- resGR[resGR %over% window]
naOrDup <- is.na(resGRsub$symbol) | duplicated(resGRsub$symbol)
resGRsub$group <- ifelse(naOrDup, names(resGRsub), resGRsub$symbol)

We create a vector specifying if the genes in this subset had a low value of padj.

status <- factor(ifelse(resGRsub$padj < 0.05 & !is.na(resGRsub$padj),
                        "sig", "notsig"))

We create an axis track specifying our location in the genome, a track that will show the genes and their names, colored by significance, and a data track that will draw vertical bars showing the moderated log fold change produced by DESeq2, which we know are only large when the effect is well supported by the information in the counts.

options(ucscChromosomeNames = FALSE)
g <- GenomeAxisTrack()
a <- AnnotationTrack(resGRsub, name = "gene ranges", feature = status)
d <- DataTrack(resGRsub, data = "log2FoldChange", baseline = 0,
               type = "h", name = "log2 fold change", strand = "+")
plotTracks(list(g, d, a), groupAnnotation = "group",
           notsig = "grey", sig = "hotpink")

library("sva")
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## The following object is masked from 'package:IRanges':
## 
##     collapse
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
## Loading required package: BiocParallel

Below we obtain a matrix of normalized counts for which the average count across samples is larger than 1

dat  <- counts(dds, normalized = TRUE)
idx  <- rowMeans(dat) > 1
dat  <- dat[idx, ]
mod  <- model.matrix(~ dex, colData(dds))
mod0 <- model.matrix(~   1, colData(dds))
svseq <- svaseq(dat, mod, mod0, n.sv = 2)
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5
svseq$sv
##            [,1]        [,2]
## [1,]  0.2465669 -0.51599084
## [2,]  0.2588137 -0.59462876
## [3,]  0.1384516  0.24920662
## [4,]  0.2179075  0.37716083
## [5,] -0.6042910 -0.06305844
## [6,] -0.6138795 -0.03623320
## [7,]  0.1821306  0.30328185
## [8,]  0.1743002  0.28026195
par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
  stripchart(svseq$sv[, i] ~ dds$cell, vertical = TRUE, main = paste0("SV", i))
  abline(h = 0)
 }

we can see how well the SVA method did at recovering these variables

ddssva <- dds
ddssva$SV1 <- svseq$sv[,1]
ddssva$SV2 <- svseq$sv[,2]
design(ddssva) <- ~ SV1 + SV2 + dex

Finally, in order to use SVA to remove any effect on the counts from our surrogate variables, we simply add these two surrogate variables as columns to the DESeqDataSet and then add them to the design.

Using RUV with DESeq2

library("RUVSeq")
## Loading required package: EDASeq
## Loading required package: ShortRead
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: Rsamtools
## Loading required package: GenomicAlignments
## 
## Attaching package: 'GenomicAlignments'
## The following object is masked from 'package:dplyr':
## 
##     last
## 
## Attaching package: 'ShortRead'
## The following objects are masked from 'package:Gviz':
## 
##     chromosome, position
## The following objects are masked from 'package:ReportingTools':
## 
##     basePath, name
## The following object is masked from 'package:dplyr':
## 
##     id
## The following object is masked from 'package:magrittr':
## 
##     functions
## Loading required package: edgeR
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
set <- newSeqExpressionSet(counts(dds))
idx  <- rowSums(counts(set) > 5) >= 2
set  <- set[idx, ]
set <- betweenLaneNormalization(set, which="upper")
not.sig <- rownames(res)[which(res$pvalue > .1)]
empirical <- rownames(set)[ rownames(set) %in% not.sig ]
set <- RUVg(set, empirical, k=2)
pData(set)
##                     W_1         W_2
## SRR1039508 -0.224881168  0.42992983
## SRR1039509 -0.249022928  0.53858506
## SRR1039512  0.001460949  0.01437385
## SRR1039513 -0.175547525  0.08408354
## SRR1039516  0.599387535 -0.02512358
## SRR1039517  0.590516825 -0.02549392
## SRR1039520 -0.241071948 -0.50369551
## SRR1039521 -0.300841739 -0.51265927
par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
  stripchart(pData(set)[, i] ~ dds$cell, vertical = TRUE, main = paste0("W", i))
  abline(h = 0)
 }

ddsruv <- dds
ddsruv$W1 <- set$W_1
ddsruv$W2 <- set$W_2
design(ddsruv) <- ~ W1 + W2 + dex

Time course experiments

library("fission")
data("fission")
ddsTC <- DESeqDataSet(fission, ~ strain + minute + strain:minute)

use a design formula that models the strain difference at time 0, the difference over time, and any strain-specific differences over time (the interaction term strain:minute).

ddsTC <- DESeq(ddsTC, test="LRT", reduced = ~ strain + minute)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resTC <- results(ddsTC)
resTC$symbol <- mcols(ddsTC)$symbol
head(resTC[order(resTC$padj),], 4)
## log2 fold change (MLE): strainmut.minute180 
## LRT p-value: '~ strain + minute + strain:minute' vs '~ strain + minute' 
## DataFrame with 4 rows and 7 columns
##               baseMean log2FoldChange     lfcSE      stat      pvalue
##              <numeric>      <numeric> <numeric> <numeric>   <numeric>
## SPBC2F12.09c   174.671     -2.6567195  0.752261   97.2834 1.97415e-19
## SPAC1002.18    444.505     -0.0509321  0.204299   56.9536 5.16955e-11
## SPAC1002.19    336.373     -0.3927490  0.573494   43.5339 2.87980e-08
## SPAC1002.17c   261.773     -1.1387648  0.606129   39.3158 2.05137e-07
##                     padj      symbol
##                <numeric> <character>
## SPBC2F12.09c 1.33453e-15       atf21
## SPAC1002.18  1.74731e-07        urg3
## SPAC1002.19  6.48916e-05        urg1
## SPAC1002.17c 3.46682e-04        urg2
fiss <- plotCounts(ddsTC, which.min(resTC$padj), 
                   intgroup = c("minute","strain"), returnData = TRUE)
fiss$minute <- as.numeric(as.character(fiss$minute))
ggplot(fiss,
  aes(x = minute, y = count, color = strain, group = strain)) + 
  geom_point() + stat_summary(fun.y=mean, geom="line") +
  scale_y_log10()
## Warning: `fun.y` is deprecated. Use `fun` instead.

resultsNames(ddsTC)
##  [1] "Intercept"           "strain_mut_vs_wt"    "minute_15_vs_0"     
##  [4] "minute_30_vs_0"      "minute_60_vs_0"      "minute_120_vs_0"    
##  [7] "minute_180_vs_0"     "strainmut.minute15"  "strainmut.minute30" 
## [10] "strainmut.minute60"  "strainmut.minute120" "strainmut.minute180"
res30 <- results(ddsTC, name="strainmut.minute30", test="Wald")
res30[which.min(resTC$padj),]
## log2 fold change (MLE): strainmut.minute30 
## Wald test p-value: strainmut.minute30 
## DataFrame with 1 row and 6 columns
##               baseMean log2FoldChange     lfcSE      stat      pvalue      padj
##              <numeric>      <numeric> <numeric> <numeric>   <numeric> <numeric>
## SPBC2F12.09c   174.671       -2.60047  0.634343  -4.09947 4.14099e-05  0.279931
betas <- coef(ddsTC)
colnames(betas)
##  [1] "Intercept"           "strain_mut_vs_wt"    "minute_15_vs_0"     
##  [4] "minute_30_vs_0"      "minute_60_vs_0"      "minute_120_vs_0"    
##  [7] "minute_180_vs_0"     "strainmut.minute15"  "strainmut.minute30" 
## [10] "strainmut.minute60"  "strainmut.minute120" "strainmut.minute180"
topGenes <- head(order(resTC$padj),20)
mat <- betas[topGenes, -c(1,2)]
thr <- 3 
mat[mat < -thr] <- -thr
mat[mat > thr] <- thr
pheatmap(mat, breaks=seq(from=-thr, to=thr, length=101),
         cluster_col=FALSE)
## Found more than one class "unit" in cache; using the first, from namespace 'hexbin'
## Also defined by 'ggbio'
## Found more than one class "simpleUnit" in cache; using the first, from namespace 'hexbin'
## Also defined by 'ggbio'
## Found more than one class "simpleUnit" in cache; using the first, from namespace 'hexbin'
## Also defined by 'ggbio'

### The bottom set of genes show strong induction of expression for the baseline samples in minutes 15-60 (red boxes in the bottom left corner), but then have slight differences for the mutant strain (shown in the boxes in the bottom right corner).

Session information, which reports the version numbers of R and all the packages used in this session

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] fission_1.8.0               RUVSeq_1.22.0              
##  [3] edgeR_3.30.3                limma_3.44.3               
##  [5] EDASeq_2.22.0               ShortRead_1.46.0           
##  [7] GenomicAlignments_1.24.0    Rsamtools_2.4.0            
##  [9] Biostrings_2.56.0           XVector_0.28.0             
## [11] sva_3.36.0                  BiocParallel_1.22.0        
## [13] mgcv_1.8-33                 nlme_3.1-149               
## [15] Gviz_1.32.0                 ReportingTools_2.28.0      
## [17] knitr_1.30                  org.Hs.eg.db_3.11.4        
## [19] genefilter_1.70.0           apeglm_1.10.0              
## [21] ggbeeswarm_0.6.0            glmpca_0.2.0               
## [23] PoiClaClu_1.0.2.1           RColorBrewer_1.1-2         
## [25] pheatmap_1.0.12             ggplot2_3.3.2              
## [27] dplyr_1.0.2                 vsn_3.56.0                 
## [29] DESeq2_1.28.1               magrittr_1.5               
## [31] GenomicFeatures_1.40.1      AnnotationDbi_1.50.3       
## [33] tximeta_1.6.3               airway_1.8.0               
## [35] SummarizedExperiment_1.18.2 DelayedArray_0.14.1        
## [37] matrixStats_0.57.0          Biobase_2.48.0             
## [39] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2        
## [41] IRanges_2.22.2              S4Vectors_0.26.1           
## [43] BiocGenerics_0.34.0        
## 
## loaded via a namespace (and not attached):
##   [1] R.utils_2.10.1                tidyselect_1.1.0             
##   [3] RSQLite_2.2.1                 htmlwidgets_1.5.2            
##   [5] DESeq_1.39.0                  munsell_0.5.0                
##   [7] preprocessCore_1.50.0         withr_2.3.0                  
##   [9] colorspace_1.4-1              Category_2.54.0              
##  [11] OrganismDbi_1.30.0            rstudioapi_0.11              
##  [13] labeling_0.3                  tximport_1.16.1              
##  [15] bbmle_1.0.23.1                GenomeInfoDbData_1.2.3       
##  [17] hwriter_1.3.2                 bit64_4.0.5                  
##  [19] farver_2.0.3                  coda_0.19-4                  
##  [21] vctrs_0.3.4                   generics_0.0.2               
##  [23] xfun_0.18                     biovizBase_1.36.0            
##  [25] BiocFileCache_1.12.1          R6_2.4.1                     
##  [27] locfit_1.5-9.4                AnnotationFilter_1.12.0      
##  [29] bitops_1.0-6                  reshape_0.8.8                
##  [31] assertthat_0.2.1              promises_1.1.1               
##  [33] scales_1.1.1                  nnet_7.3-14                  
##  [35] beeswarm_0.2.3                gtable_0.3.0                 
##  [37] affy_1.66.0                   ggbio_1.36.0                 
##  [39] ensembldb_2.12.1              rlang_0.4.8                  
##  [41] splines_4.0.3                 rtracklayer_1.48.0           
##  [43] lazyeval_0.2.2                dichromat_2.0-0              
##  [45] hexbin_1.28.1                 checkmate_2.0.0              
##  [47] BiocManager_1.30.10           yaml_2.2.1                   
##  [49] reshape2_1.4.4                backports_1.1.10             
##  [51] httpuv_1.5.4                  Hmisc_4.4-1                  
##  [53] RBGL_1.64.0                   tools_4.0.3                  
##  [55] affyio_1.58.0                 ellipsis_0.3.1               
##  [57] Rcpp_1.0.5                    plyr_1.8.6                   
##  [59] base64enc_0.1-3               progress_1.2.2               
##  [61] zlibbioc_1.34.0               purrr_0.3.4                  
##  [63] RCurl_1.98-1.2                prettyunits_1.1.1            
##  [65] rpart_4.1-15                  openssl_1.4.3                
##  [67] cluster_2.1.0                 data.table_1.13.0            
##  [69] mvtnorm_1.1-1                 ProtGenerics_1.20.0          
##  [71] aroma.light_3.18.0            hms_0.5.3                    
##  [73] mime_0.9                      evaluate_0.14                
##  [75] xtable_1.8-4                  XML_3.99-0.5                 
##  [77] emdbook_1.3.12                jpeg_0.1-8.1                 
##  [79] gridExtra_2.3                 compiler_4.0.3               
##  [81] biomaRt_2.44.4                bdsmatrix_1.3-4              
##  [83] tibble_3.0.4                  crayon_1.3.4                 
##  [85] R.oo_1.24.0                   htmltools_0.5.0              
##  [87] GOstats_2.54.0                later_1.1.0.1                
##  [89] Formula_1.2-4                 geneplotter_1.66.0           
##  [91] DBI_1.1.0                     dbplyr_1.4.4                 
##  [93] MASS_7.3-53                   rappdirs_0.3.1               
##  [95] Matrix_1.2-18                 readr_1.4.0                  
##  [97] R.methodsS3_1.8.1             pkgconfig_2.0.3              
##  [99] numDeriv_2016.8-1.1           foreign_0.8-80               
## [101] xml2_1.3.2                    annotate_1.66.0              
## [103] vipor_0.4.5                   AnnotationForge_1.30.1       
## [105] stringr_1.4.0                 VariantAnnotation_1.34.0     
## [107] digest_0.6.26                 graph_1.66.0                 
## [109] rmarkdown_2.4                 htmlTable_2.1.0              
## [111] GSEABase_1.50.1               curl_4.3                     
## [113] shiny_1.5.0                   lifecycle_0.2.0              
## [115] jsonlite_1.7.1                PFAM.db_3.11.4               
## [117] askpass_1.1                   BSgenome_1.56.0              
## [119] pillar_1.4.6                  lattice_0.20-41              
## [121] GGally_2.0.0                  fastmap_1.0.1                
## [123] httr_1.4.2                    survival_3.2-7               
## [125] GO.db_3.11.4                  interactiveDisplayBase_1.26.3
## [127] glue_1.4.2                    png_0.1-7                    
## [129] BiocVersion_3.11.1            bit_4.0.4                    
## [131] Rgraphviz_2.32.0              stringi_1.5.3                
## [133] blob_1.2.1                    AnnotationHub_2.20.2         
## [135] latticeExtra_0.6-29           memoise_1.1.0